Mindmap Hypotheses

Mindmap Hypotheses

Importing Needed packages

library(tidyverse)
library(lubridate)
library(e1071) 
library(gridExtra)
library(janitor)
library(data.table)
library(funModeling)
library(kableExtra)
library(htmltools)
library(ISOweek)
library(vcd)
library(plotly)
library(timetk)
library(ggpubr)
library(Metrics)
library(tidymodels)


theme_set(theme_minimal()) 

Helper Functions

# Pairwise association matrix for categorical variables.
#
# For every pair of columns in `x`, computes the association statistic
# selected by `type` ("cramer" = Cramer's V, "phi", or the contingency
# coefficient) from vcd::assocstats() on the pairwise contingency table.
#
# @param x    data.frame (or matrix) of categorical columns.
# @param type which statistic to extract; the first value is the default.
# @return a symmetric numeric matrix with the columns of `x` as dimnames.
catcor <- function(x, type = c("cramer", "phi", "contingency")) {
    # fail early with a clear error instead of require()'s FALSE return
    if (!requireNamespace("vcd", quietly = TRUE)) {
        stop("catcor() requires the 'vcd' package", call. = FALSE)
    }
    type <- match.arg(type)
    nc <- ncol(x)
    # all (row, column) index pairs of the result matrix
    pairs <- expand.grid(seq_len(nc), seq_len(nc))
    res <- matrix(
        mapply(
            function(i1, i2) vcd::assocstats(table(x[, i1], x[, i2]))[[type]],
            pairs[, 1], pairs[, 2]
        ),
        nc, nc
    )
    rownames(res) <- colnames(res) <- colnames(x)
    res
}

# Min-max scaling: linearly map `x` onto [0, 1].
#
# @param x numeric vector. A constant vector yields NaN (zero range),
#          and any NA in `x` propagates to the result.
# @return numeric vector the same length as `x`.
minmax_scaler <- function(x) {
  # single range() pass instead of separate min()/max() calls
  rng <- range(x)
  (x - rng[1]) / (rng[2] - rng[1])
}


# Robust scaling: center on the median and scale by the interquartile
# range, making the result resistant to outliers.
#
# @param x numeric vector. An IQR of zero yields NaN/Inf; NA propagates.
# @return numeric vector the same length as `x`.
robust_scaler <- function(x) {
  # one quantile() call instead of three; names = FALSE drops the
  # "25%"/"50%"/"75%" labels from the intermediate values
  qs <- quantile(x, probs = c(0.25, 0.5, 0.75), names = FALSE)
  (x - qs[2]) / (qs[3] - qs[1])
}


# Summarise regression performance (MAE, MAPE, RMSE) for a set of
# predictions.
#
# @param model_name        label stored in the Model_Name column.
# @param model_predictions data frame holding `actual` and `predictions`
#                          columns (as produced by the CV loop below).
# @return one-row data.frame: Model_Name, MAE, MAPE, RMSE (3 decimals).
ml_error <- function(model_name = "Linear Regression Model", model_predictions) {
  # run one yardstick metric on the predictions and round its estimate
  score <- function(metric) {
    round(metric(model_predictions, actual, predictions)$.estimate, 3)
  }

  data.frame(
    Model_Name = model_name,
    MAE  = score(yardstick::mae),
    MAPE = score(yardstick::mape),
    RMSE = score(yardstick::rmse)
  )
}


# TimeSeries cross validation function
cross_validation <- function(data_training, kfold , Model_name, model){
  
mae_list  <- c()
mape_list <- c()
rmse_list <- c() 
  
for (k in seq(kfold ,1)){
  print(paste("kfolds number", k))
    
  validation_start_date <- max(data_train$date) - as.difftime(k*6*7, units = "days")
    
  validation_end_date <- max(data_train$date) - as.difftime((k-1)*6*7, units = "days")
    
  data_training <- data_train %>% 
    filter(date < validation_start_date)
    
  data_validation <- data_train %>% 
    filter(date >= validation_start_date & date <= validation_end_date)
    
  lm_fit_cv <- model %>% fit(sales ~ . , data = data_training)
    
  lm_pred_cv <- lm_fit_cv %>% predict(data_validation) %>% 
    bind_cols(data_validation$sales) %>% 
    rename(predictions = ".pred", actual = "...2")
    
  lm_result_cv <- ml_error("Linear Regression Model",lm_pred_cv) 
    
  
    # store performance of each kfold iteration
  mae_list [k] <- unlist( lm_result_cv['MAE']   , use.names=FALSE) 
  mape_list[k] <- unlist( lm_result_cv['MAPE']  , use.names=FALSE )
  rmse_list[k] <- unlist( lm_result_cv['RMSE']  , use.names=FALSE)
    
  
    
  }
  
 return( tibble( Model_name = Model_name,
                 MAE = paste( round(mean(mae_list),2)," +/- ",   round(sd(mae_list),  2)),
                 MAPE = paste( round(mean(mape_list),2)," +/- ", round(sd(mape_list), 2)),
                 RMSE = paste( round(mean(rmse_list),2)," +/- ", round(sd(rmse_list), 2))) )  
  
}

Reading the data

# Dataset of Sales
df_sales_raw <- data.table::fread("/home/renato/repos/Rossmann/inst/Data/train.csv", stringsAsFactors = T,  na.strings = c("","'","NA"))

# Dataset of Stores
df_store_raw <- data.table::fread("/home/renato/repos/Rossmann/inst/Data/store.csv", stringsAsFactors = T,  na.strings=c("","'","NA"))

# merge
df_raw <- merge(df_sales_raw, df_store_raw, by= "Store")

Description of Data

Rename Columns

#making a copy of the dataset df_raw
df1 <- df_raw %>% 
  clean_names()

rmarkdown::paged_table(head(df1))

Data Dimensions

print(paste("Number of Rows: " ,nrow(df_raw)))
[1] "Number of Rows:  1017209"
print(paste("Number of Cols: " ,ncol(df_raw)))
[1] "Number of Cols:  18"

Data Types

# converting the "date" feature to datetime
df1$date <- ymd(df1$date)

glimpse(df1)
Rows: 1,017,209
Columns: 18
$ store                        <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ day_of_week                  <int> 5, 4, 3, 2, 1, 7, 6, 5, 4, 3, 2, 1, 7, 6…
$ date                         <date> 2015-07-31, 2015-07-30, 2015-07-29, 201…
$ sales                        <int> 5263, 5020, 4782, 5011, 6102, 0, 4364, 3…
$ customers                    <int> 555, 546, 523, 560, 612, 0, 500, 459, 50…
$ open                         <int> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1…
$ promo                        <int> 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ state_holiday                <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ school_holiday               <int> 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ store_type                   <fct> c, c, c, c, c, c, c, c, c, c, c, c, c, c…
$ assortment                   <fct> a, a, a, a, a, a, a, a, a, a, a, a, a, a…
$ competition_distance         <int> 1270, 1270, 1270, 1270, 1270, 1270, 1270…
$ competition_open_since_month <int> 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9…
$ competition_open_since_year  <int> 2008, 2008, 2008, 2008, 2008, 2008, 2008…
$ promo2                       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ promo2since_week             <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ promo2since_year             <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ promo_interval               <fct> , , , , , , , , , , , , , , , , , , , , …

Checking NA

colSums(is.na(df1))
                       store                  day_of_week 
                           0                            0 
                        date                        sales 
                           0                            0 
                   customers                         open 
                           0                            0 
                       promo                state_holiday 
                           0                            0 
              school_holiday                   store_type 
                           0                            0 
                  assortment         competition_distance 
                           0                         2642 
competition_open_since_month  competition_open_since_year 
                      323348                       323348 
                      promo2             promo2since_week 
                           0                       508031 
            promo2since_year               promo_interval 
                      508031                            0 

Fillout NA

# removing missing values
df1 <- df1 %>% 
  mutate(
    
    # replace the missing values with the value of 200000
    competition_distance = ifelse(is.na(competition_distance), 200000, competition_distance),
    
    # replace the missing values with the month in the date column     
    competition_open_since_month = ifelse(is.na(competition_open_since_month), month(date), competition_open_since_month),
    
    # replace the missing values with the year in the date column      
    competition_open_since_year = ifelse(is.na(competition_open_since_year), year(date), competition_open_since_year),
    
    # replace the missing values with the week in the date column      
    promo2since_week = ifelse(is.na(promo2since_week ), week(date), promo2since_week ),
    
    # replace the missing values with the year in the date column      
    promo2since_year = ifelse(is.na(promo2since_year), year(date), promo2since_year),
         
    month_map = month(date),
         
    month_map = month.abb[month_map])


# removing the blanks 
df1$promo_interval <- str_squish(df1$promo_interval)

# replacing missing values with 0
df1$promo_interval[df1$promo_interval==""] = "0"

# creating a column with the months that the promotion is active with value 1 and not active with value 0
df1 <- df1 %>% 
  mutate(is_promo = ifelse(promo_interval == "0",0,ifelse(str_detect(promo_interval, month_map),1,0)))

# viewing a random sample of the data

# Transposed view of 5 random rows
# (typo fix: the bootstrap option is "condensed", not "condesend")
kable(t(df1[sample(nrow(df1), 5), ])) %>%
  kable_styling(full_width = FALSE,
                bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                html_font = "Cambria")
store 619 918 185 558 912
day_of_week 4 3 4 5 3
date 2015-01-22 2014-12-10 2013-05-30 2015-06-12 2013-03-13
sales 3599 4828 8186 2364 4386
customers 385 549 574 357 648
open 1 1 1 1 1
promo 0 0 1 0 0
state_holiday 0 0 0 0 0
school_holiday 0 0 0 0 0
store_type a a d a c
assortment a c c a c
competition_distance 1600 18710 1860 3000 3100
competition_open_since_month 6 4 5 2 5
competition_open_since_year 2006 2015 2015 2010 2010
promo2 1 0 0 0 0
promo2since_week 45 50 22 24 11
promo2since_year 2009 2014 2013 2015 2013
promo_interval Feb,May,Aug,Nov 0 0 0 0
month_map Jan Dec May Jun Mar
is_promo 0 0 0 0 0

Change Types

df1 <- df1 %>% 
  mutate(competition_distance = as.integer(competition_distance),
         month_map = as.factor(month_map))

Descriptive Statistics

# selecting only numeric features
num_attributes <- df1 %>% 
  keep(is.numeric)

# selecting only categorical features
cat_attributes <- df1 %>% 
  keep(is.factor)

Numeric Attributes

# Central Tendency  - mean , median
num_mean <- as.data.frame( t(lapply(num_attributes, mean)))

num_median <- as.data.frame( t(lapply(num_attributes, median)))

# dispersion - std, min, max, range, skew, kurtosis
num_std <- as.data.frame( t(lapply(num_attributes, sd)))

num_min <- as.data.frame( t(lapply(num_attributes, min)))

num_max <- as.data.frame( t(lapply(num_attributes, max)))

num_skew <- as.data.frame( t(lapply(num_attributes, skewness)))

num_kurt <- as.data.frame( t(lapply(num_attributes, kurtosis)))

table_desc <- t(bind_rows(num_min,num_max,num_mean,num_median,num_std,num_skew,num_kurt))

table_desc<- as.data.frame(table_desc)


names(table_desc) <- c("min","max","mean","median","std","skew", "kurtosis")


# Render the descriptive-statistics table
# (typo fix: the bootstrap option is "condensed", not "condesend")
kable(table_desc, digits = 4) %>%
  kable_styling(full_width = FALSE,
                bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                html_font = "Cambria")
min max mean median std skew kurtosis
store 1 1115 558.4297 558 321.9087 -0.0009548772 -1.200527
day_of_week 1 7 3.998341 4 1.997391 0.001592818 -1.246877
sales 0 41551 5773.819 5744 3849.926 0.6414577 1.778351
customers 0 7388 633.1459 609 464.4117 1.598646 7.091712
open 0 1 0.8301067 1 0.3755392 -1.758039 1.090703
promo 0 1 0.3815145 0 0.4857586 0.4878364 -1.762017
school_holiday 0 1 0.1786467 0 0.3830564 1.677837 0.8151368
competition_distance 20 200000 5935.443 2330 12547.65 10.24231 147.7887
competition_open_since_month 1 12 6.786849 7 3.311087 -0.04207551 -1.232611
competition_open_since_year 1900 2015 2010.325 2012 5.515593 -7.235636 124.0704
promo2 0 1 0.5005638 1 0.4999999 -0.002255189 -1.999997
promo2since_week 1 53 23.69512 22 14.353 0.1829283 -1.179229
promo2since_year 2009 2015 2012.793 2013 1.662658 -0.7844339 -0.2100855
is_promo 0 1 0.1718349 0 0.3772371 1.739833 1.027021
df1 %>% 
  ggplot(aes(sales))+
  geom_histogram(aes(y =..density..),col="black", fill="steelblue")+
  stat_function(fun = dnorm, args = list(mean = mean(df1$sales), sd = sd(df1$sales)), col="red", lwd=1)

Categorical Attributes

apply(cat_attributes, 2, function(x) length(unique(x)))
state_holiday    store_type    assortment     month_map 
            4             4             3            12 
boxplot_01 <- df1 %>% 
  filter(state_holiday != 0 & sales > 0 ) %>% 
  ggplot(aes(x = state_holiday, y = sales, fill=state_holiday))+
  scale_y_continuous(breaks = seq(0,40000,5000))+
  geom_boxplot()



boxplot_02 <- df1 %>% 
  filter(state_holiday != 0 & sales > 0 ) %>% 
  ggplot(aes(x = store_type, y = sales, fill=store_type))+
  scale_y_continuous(breaks = seq(0,40000,5000))+
  geom_boxplot()

boxplot_03 <- df1 %>% 
  filter(state_holiday != 0 & sales > 0 ) %>% 
  ggplot(aes(x = assortment, y = sales, fill=assortment))+
  scale_y_continuous(breaks = seq(0,40000,5000))+
  geom_boxplot()

grid.arrange(boxplot_01,boxplot_02,boxplot_03,nrow= 2,ncol=2)

Feature Engineering

df2 <- df1

Mindmap Hypotheses

Mindmap Hypotheses

Mindmap Hypotheses

Creation of Hypotheses

1. Lojas com número maior de funcionários deveriam vender mais.
1. Stores with more employees should sell more.

2. Lojas com maior capacidade de estoque deveriam vender mais.
2. Stores with greater inventory capacity should sell more.

3. Lojas com maior porte deveriam vender mais.
3. Larger stores should sell more.

4. Lojas com maior sortimentos deveriam vender mais.
4. Stores with larger assortments should sell more.

5. Lojas com competidores mais próximos deveriam vender menos.
5. Stores with closer competitors should sell less.

6. Lojas com competidores à mais tempo deveriam vendem mais.
6. Stores with longer-established competitors should sell more.

2.2.2. Products Hypotheses

1. Lojas que investem mais em Marketing deveriam vender mais.
1. Stores that invest more in Marketing should sell more.

2. Lojas com maior exposição de produto deveriam vender mais.
2. Stores with greater product exposure should sell more.

3. Lojas com produtos com preço menor deveriam vender mais.
3. Stores with lower priced products should sell more.

4. Lojas com promoções mais agressivas ( descontos maiores ), deveriam vender mais.
4. Stores with more aggressive promotions (bigger discounts), should sell more.

5. Lojas com promoções ativas por mais tempo deveriam vender mais.
5. Stores with active promotions for longer should sell more.

6. Lojas com mais dias de promoção deveriam vender mais.
6. Stores with more promotion days should sell more.

7. Lojas com mais promoções consecutivas deveriam vender mais.
7. Stores with more consecutive promotions should sell more.

2.2.3. Time Hypotheses

1. Lojas abertas durante o feriado de Natal deveriam vender mais.
1. Stores open during the Christmas holiday should sell more.

2. Lojas deveriam vender mais ao longo dos anos.
2. Stores should sell more over the years.

3. Lojas deveriam vender mais no segundo semestre do ano.
3. Stores should sell more in the second half of the year.

4. Lojas deveriam vender mais depois do dia 10 de cada mês.
4. Stores should sell more after the 10th of each month.

5. Lojas deveriam vender menos aos finais de semana.
5. Stores should sell less on weekends.

6. Lojas deveriam vender menos durante os feriados escolares.
6. Stores should sell less during school holidays.

2.3. Final List of Hypotheses

1. Lojas com maior sortimentos deveriam vender mais.
1. Stores with larger assortments should sell more.

2. Lojas com competidores mais próximos deveriam vender menos.
2. Stores with closer competitors should sell less.

3. Lojas com competidores à mais tempo deveriam vendem mais.
3. Stores with longer-established competitors should sell more.

4. Lojas com promoções ativas por mais tempo deveriam vender mais.
4. Stores with active promotions for longer should sell more.

5. Lojas com mais dias de promoção deveriam vender mais.
5. Stores with more promotion days should sell more.

6. Lojas com mais promoções consecutivas deveriam vender mais.
6. Stores with more consecutive promotions should sell more.

7. Lojas abertas durante o feriado de Natal deveriam vender mais.
7. Stores open during the Christmas holiday should sell more.

8. Lojas deveriam vender mais ao longo dos anos.
8. Stores should sell more over the years.

9. Lojas deveriam vender mais no segundo semestre do ano.
9. Stores should sell more in the second half of the year.

10. Lojas deveriam vender mais depois do dia 10 de cada mês.
10. Stores should sell more after the 10th of each month.

11. Lojas deveriam vender menos aos finais de semana.
11. Stores should sell less on weekends.

12. Lojas deveriam vender menos durante os feriados escolares.
12. Stores should sell less during school holidays.

Feature Engineering

df2 <- df2 %>% 
  mutate(
    
    # Extracting year
    year = as.integer(year(date)),
    
    # Extracting month
    month = month(date),
    
    # Extracting day
    day = day(date),
    
    #Extracting week of the year
    week_of_year = week(date),
    
    # Extracting year and week
    year_week = strftime(date, format = "%Y-%W"),
    
    # Extracting first day of the month
    first_day_of_month = "01",
    
    # Turning into Date
    competition_since = make_date(competition_open_since_year, competition_open_since_month,first_day_of_month),
    
    # Getting the difference in days
    competition_time_month = as.integer(difftime(date, competition_since, units = "days")/30)
        
        )


# this is function to convert year-week to year-month-day
# Convert an ISO year / week / weekday triple to a Date.
#
# @param ano         year (e.g. 2014).
# @param semana      ISO week number (zero-padded to two digits).
# @param diadasemana ISO weekday (1 = Monday ... 7 = Sunday).
# @return a Date, shifted back by one day from the ISO-week date.
data_da_semana <- function(ano, semana, diadasemana) {
  iso <- sprintf("%s-W%02d-%s", ano, semana, diadasemana)
  ISOweek2date(iso) - 1
}

df2 <- df2 %>% 
  mutate(
    
    # convert year-week to year-month-day
    promo_since = data_da_semana(promo2since_year, promo2since_week, 1),
         
    # Getting the difference in days
    promo_time_week = difftime(date, promo_since, units = "days")/7,
         
    # converting to integer
    promo_time_week = as.integer(promo_time_week),
         
    assortment = case_when(
                          # changing from a to basic  
                           assortment == "a" ~ "basic",
                          
                           # changing from b to extra 
                           assortment == "b" ~ "extra",
                                
                           # everything else for extended
                           T ~ "extended"),
    
    state_holiday = case_when(
                           # changing from a to public_holiday  
                           state_holiday == "a" ~ "public_holiday",
                                   
                           # changing from b to easter_holiday
                           state_holiday == "b" ~ "easter_holiday",
                                   
                           # changing from b to christmas
                           state_holiday == "c" ~ "christmas",
                           
                           # everything else for regular_day     
                           T ~ "regular_day"))

Variable filtering

df3 <- df2

Rows Filtering

# Removing the records of the days the stores are closed and thus obtaining only sales with values> 0
df3 <- 
  df3 %>% 
  filter(open != 0 & sales > 0)

Columns Filtering

# Removing features that were not available at the time of production and will not be needed.
df3 <- df3 %>% 
  select(-customers, -open, -promo_interval, -month_map)

Exploratory Data Analysis

df4 <- df3

Univariate Analysis

Variable Response

df4 %>% 
  ggplot(aes(sales))+
  geom_histogram(col="black", fill="steelblue")+
  scale_x_continuous(breaks = seq(0,40000, 2000))+
  labs(title = "count vs sales")+
  theme(plot.title = element_text(hjust = 0.5, size = 18))

O maior numero de vendas está a partir de 4000 a 9000 dollares.
The biggest number of sales is from 4000 to 9000 dollars.

Distribution of Numerical Variables

num_attributes %>%
  keep(is.numeric) %>% 
  gather() %>% 
  ggplot(aes(value)) +
    facet_wrap(~ key, scales = "free") +
    geom_histogram(col= "black", fill="steelblue", bins = 25)+
    scale_y_continuous(labels = function(x) format(x, scientific = FALSE))+
  labs(title = "Distribution of numerical variables")+
  theme(plot.title = element_text(hjust = 0.5, size = 18))

Categorical Variables

barplot_state_holiday <- df4 %>% 
  filter(state_holiday != "regular_day") %>% 
  ggplot(aes(state_holiday, fill=state_holiday))+
  geom_bar(col="black")+
  labs(title= "count vs state_holiday")+
  theme(plot.title = element_text(hjust = 0.5, size = 18))
  

density_state_holiday <- df4 %>% 
  filter(state_holiday != "regular_day") %>% 
  ggplot(aes(sales, fill= state_holiday))+
  geom_density(alpha= 0.4)+
  labs(title= "Distribution state holiday")+
  theme(plot.title = element_text(hjust = 0.5, size = 18))

barplot_store_type <- df4 %>% 
  ggplot(aes(store_type, fill=store_type))+
  geom_bar(col="black")+
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE))+
  labs(title= "count vs store_type")+
  theme(plot.title = element_text(hjust = 0.5, size = 18))

density_store_type <- df4 %>% 
  ggplot(aes(sales, fill= store_type))+
  geom_density(alpha= 0.4)+
  labs(title= "Distribution store_type")+
  theme(plot.title = element_text(hjust = 0.5, size = 18))

barplot_assortment <- df4 %>% 
  ggplot(aes(assortment, fill=assortment))+
  geom_bar(col="black")+
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE))+
  labs(title= "count vs assortment")+
  theme(plot.title = element_text(hjust = 0.5, size = 18))

density_assortment <- df4 %>% 
  ggplot(aes(sales, fill= assortment))+
  geom_density(alpha= 0.4)+
  labs(title= "Distribution assortment")+
  theme(plot.title = element_text(hjust = 0.5, size = 18))

grid.arrange(barplot_state_holiday, density_state_holiday,
             barplot_store_type, density_store_type,
             barplot_assortment, density_assortment,
             nrow= 3,ncol=2)

Bivariate Analysis

H1. Lojas com maior sortimentos deveriam vender mais.
H1. Stores with larger assortments should sell more.

Falsa Lojas com maior sortimento , vendem menos.
False Stores with greater assortment , sell less.

@ref(fig:distribution)

density_assortment +
  labs(title= "Distribution assortment")+
  theme(plot.title = element_text(hjust = 0.5, size = 18))

Visualizando a variável assortment para verificar se algum momento o sortimento extra foi maior.

Viewing the assortment variable to check if the extra assortment was ever greater.

ggarrange( 
  
  df4 %>%
  group_by(date, assortment) %>% 
  summarise_by_time(date, .by = "weeks",sales = sum(sales)) %>% 
  ungroup() %>% 
  ggplot(aes(date, sales))+
  geom_line(aes(col= assortment), lwd= 1)+
  scale_y_continuous(labels = scales::label_number_si())+
  scale_x_date(breaks = "2 month", minor_breaks = "1 week", date_labels = "%Y-%W")+
  labs(title= "year/week vs sales / assortment")+
  theme(plot.title = element_text(hjust = 0.5, size = 18)), 


  df4 %>%
    filter(assortment != "basic" & assortment != "extended") %>% 
    group_by(date, assortment) %>% 
    summarise_by_time(date, .by = "weeks",sales = sum(sales)) %>% 
    ungroup() %>% 
    ggplot(aes(date, sales))+
    geom_line(aes(col= assortment), lwd= 1)+
    scale_y_continuous(labels = scales::label_number_si())+
    scale_x_date(breaks = "2 month", minor_breaks = "1 week", date_labels = "%Y-%W")+
    labs(title= "year/week vs sales / extra")+
    theme(plot.title = element_text(hjust = 0.5, size = 18)), ncol = 1) 

Há uma diferença de escala entre basic/extended para extra, por isso plotarei o sortimento extra sozinho.

There is a difference in scale between basic / extended and extra, so I will plot the extra assortment by itself.

H2. Lojas com competidores mais próximos deveriam vender menos.
H2. Stores with closer competitors should sell less.

Falsa Lojas com competidores mais próximos , vendem mais.
False Stores with closest competitors , sell more.

# Bin labels for 1 km competition-distance intervals up to 20 km
label <- c("0-1000m", "1000-2000m", "2000-3000m", "3000-4000m", "4000-5000m",
           "5000-6000m", "6000-7000m", "7000-8000m", "8000-9000m", "9000-10000m",
           "10000-11000m", "11000-12000m", "12000-13000m", "13000-14000m", "14000-15000m",
           "15000-16000m", "16000-17000m", "17000-18000m", "18000-19000m", "19000-20000m")

# Bin the distances; stores beyond 20 km become NA and are dropped below.
# Column-name typo fixed: "dstance" -> "distance".
df4$competition_distance_binned <- cut(df4$competition_distance,
                                       breaks = seq(0, 20000, 1000),
                                       labels = label)

# Total sales per distance bin
fig4 <- df4 %>%
  drop_na(competition_distance_binned) %>%
  group_by(competition_distance_binned) %>%
  summarise(sales = sum(sales), .groups = "drop") %>%
  ggplot(aes(competition_distance_binned, sales, fill = competition_distance_binned)) +
  geom_col(col = "black") +
  theme(axis.text.x = element_text(angle = 90), legend.position = "none") +
  scale_y_continuous(labels = scales::label_number_si()) +
  labs(title = "Distribution competition_distance vs sales") +
  theme(plot.title = element_text(hjust = 0.5, size = 18))

# Total sales at each exact competition distance
fig5 <- df4 %>%
  group_by(competition_distance) %>%
  summarise(sales = sum(sales), .groups = "drop") %>%
  ggplot(aes(x = competition_distance, sales)) +
  geom_point(shape = 21, fill = "steelblue", size = 3, color = "white") +
  scale_y_continuous(labels = scales::label_number_si()) +
  labs(title = "competition_distance vs sales") +
  theme(plot.title = element_text(hjust = 0.5, size = 18))

grid.arrange(fig4, fig5, nrow = 2, ncol = 1)

# Pearson correlation between competition_distance and sales
# (typo fix: the bootstrap option is "condensed", not "condesend")
df4 %>%
  select(competition_distance, sales) %>%
  cor(method = "pearson") %>%
  kable(caption = "Correlation Table Competition_distance vs Sales") %>%
  kable_styling(full_width = FALSE,
                bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                html_font = "Cambria")
Correlation Table Competition_distance vs Sales
competition_distance sales
competition_distance 1.0000000 -0.0415451
sales -0.0415451 1.0000000

H3. Lojas com competidores à mais tempo deveriam vendem mais.
H3. Stores with longer-established competitors should sell more.

Falsa Lojas com competidores á mais tempo , vendem menos.
False Stores with competitors for longer , sell less.

ggarrange(

  df4 %>% 
  group_by(competition_time_month) %>% 
  summarise(sales= sum(sales), .groups = "drop") %>%
  filter(competition_time_month < 120 & competition_time_month != 0) %>%
  ggplot(aes(competition_time_month, sales)) +
  geom_point(col="steelblue", size= 1)+
  geom_smooth( formula = "y~x",method = "lm", se = FALSE, color= "red")+
  scale_y_continuous(labels = scales::label_number_si())+
  labs(title= "competition_time_month vs sales")+
  theme(plot.title = element_text(hjust = 0.5, size = 18)),

df4 %>% 
  group_by(competition_time_month) %>% 
  summarise(sales= sum(sales), .groups = "drop") %>%
  filter(competition_time_month < 120 & competition_time_month != 0) %>% 
 
  plot_time_series(competition_time_month,sales, .interactive = F, .title = "Competition_time_month Vs Sales",
                   .x_lab= "competition_time_month",
                   .y_lab="sales",
                   .line_color = "red", 
                   .line_size= 1)+
  scale_y_continuous(labels = scales::label_number_si())+
  scale_x_continuous(breaks = seq(-40,120,10))+
  labs(title= "competition_time_month vs sales")+
  theme(plot.title = element_text(hjust = 0.5, size = 18))
,ncol=1 )

# Pearson correlation between competition_time_month and sales
# (typo fix: the bootstrap option is "condensed", not "condesend")
df4 %>%
  select(competition_time_month, sales) %>%
  cor(method = "pearson") %>%
  kable(caption = "Correlation Table Competition_time_month vs Sales") %>%
  kable_styling(full_width = FALSE,
                bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                html_font = "Cambria")
Correlation Table Competition_time_month vs Sales
competition_time_month sales
competition_time_month 1.0000000 -0.0034788
sales -0.0034788 1.0000000

H4. Lojas com promoções ativas por mais tempo deveriam vender mais.
H4. Stores with active promotions for longer should sell more.

Falsa Lojas com promoções ativas por mais tempo , vendem menos.
False Stores with active promotions longer , sell less.

fig8 <- aux1 %>% 
  group_by(promo_time_week) %>% 
  summarise(sales = sum(sales)) %>% 
  filter(promo_time_week < 0) %>% 
  ggplot(aes(promo_time_week, sales))+
  geom_bar(stat='identity', fill="steelblue", col="black")+
  scale_y_continuous(labels = scales::label_number_si())+
  labs(title= "promo_time_week < 0 vs sales")+
  theme(plot.title = element_text(hjust = 0.5, size = 18))

fig9 <- aux1 %>% 
  group_by(promo_time_week) %>% 
  summarise(sales = sum(sales)) %>% 
  filter(promo_time_week > 0) %>% 
  ggplot(aes(promo_time_week, sales))+
  geom_bar(stat='identity', fill="steelblue", col="black")+
  scale_y_continuous(labels = scales::label_number_si())+
  labs(title= "promo_time_week > 0 vs sales")+
  theme(plot.title = element_text(hjust = 0.5, size = 18))

fig14 <- aux1 %>% 
  group_by(promo_time_week) %>% 
  summarise(sales = sum(sales)) %>% 
  filter(promo_time_week > 0) %>% 
  ggplot(aes(promo_time_week, sales))+
  geom_point(shape=21, fill="steelblue", size=3, color="white")+
  scale_y_continuous(labels = scales::label_number_si())+
  geom_smooth( formula = "y~x",method = "lm", se = FALSE, color= "red")+
  labs(title= "promo_time_week vs sales")+
  theme(plot.title = element_text(hjust = 0.5, size = 18))
  

grid.arrange(fig8, fig9,fig14, nrow= 3,ncol=1)

# Pearson correlation between promo_time_week and sales
# NOTE(review): `aux1` is never defined in this document — presumably a
# promo2 subset of df4; confirm the upstream chunk that creates it.
# (typo fix: the bootstrap option is "condensed", not "condesend")
aux1 %>%
  select(promo_time_week, sales) %>%
  cor(method = "pearson") %>%
  kable(caption = "Correlation Table promo_time_week vs Sales") %>%
  kable_styling(full_width = FALSE,
                bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                html_font = "Cambria")
Correlation Table promo_time_week vs Sales
promo_time_week sales
promo_time_week 1.0000000 -0.0572574
sales -0.0572574 1.0000000

H5. Lojas com mais promoções consecutivas deveriam vender mais.
H5. Stores with more consecutive promotions should sell more.

Falsa Lojas com mais promoções consecutivas , vendem menos.
False. Stores with more consecutive promotions sell less.

df4 %>% 
  group_by(promo, promo2) %>% 
  summarise(sales= sum(sales)) %>% 
  kable() %>% 
  kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condesend", "responsive"),html_font = "Cambria")
promo promo2 sales
0 0 1482612096
0 1 1289362241
1 0 1628930532
1 1 1472275754
pp <- ggplot( ) + aes(x= date) + geom_line(lwd=1,data= df4 %>% filter(promo == 1 & promo2 ==1) %>% group_by(date) %>% 
  summarise_by_time(date, .by = "weeks",sales = sum(sales)),aes(y= sales, col= "Tradicional & Extendida", group = "Tradicional & Extendida"))+
  geom_line(lwd=1,data= df4 %>% filter(promo == 1 & promo2 ==0) %>% group_by(date) %>% 
  summarise_by_time(date, .by = "weeks",sales = sum(sales)),aes(y= sales, col= "Extendida", group = "basic"))+
  scale_y_continuous(labels = scales::label_number_si())+
  scale_x_date(breaks = "2 month", minor_breaks = "1 week", date_labels = "%Y-%W")+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  labs(x= "year week")+
  labs(title= "year/week vs sales ")+
  theme(plot.title = element_text(hjust = 0.5, size = 18))
  
pp

H6. Lojas abertas durante o feriado de Natal deveriam vender mais.
H6. Stores open during the Christmas holiday should sell more.

Falsa Lojas abertas durante o feriado de Natal , vendem menos.
False Stores opened during the Christmas holiday , sell less.

fig10 <- df4 %>% 
  filter(state_holiday != "regular_day") %>% 
  ggplot(aes(year , sales,fill=state_holiday))+
  geom_bar(stat='identity', position=position_dodge())+
  labs(title= "count vs state_holiday")+
  theme(plot.title = element_text(hjust = 0.5, size = 18), legend.position="none")
 

grid.arrange(barplot_state_holiday, fig10, nrow= 2,ncol=1)

H7. Lojas deveriam vender mais ao longo dos anos.
H7. Stores should sell more over the years.

Falsa Lojas vendem menos , ao longo dos anos.
False Stores sell less, over the years.

df4 %>% 
  group_by(year) %>% 
  summarise(sales=sum(sales)) %>% 
  ggplot(aes(year, sales))+
  geom_line(col="steelblue", lwd=1)+
  scale_x_continuous(breaks = c(2013,2014,2015))+
  scale_y_continuous(labels = scales::label_number_si())+
  labs(title= "year vs sales")+
  theme(plot.title = element_text(hjust = 0.5, size = 18))

# Pearson correlation between year and sales
# (typo fix: the bootstrap option is "condensed", not "condesend")
df4 %>%
  select(year, sales) %>%
  cor(method = "pearson") %>%
  kable(caption = "Correlation Table year vs Sales") %>%
  kable_styling(full_width = FALSE,
                bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                html_font = "Cambria")
Correlation Table year vs Sales
year sales
year 1.0000000 0.0361515
sales 0.0361515 1.0000000

H8. Lojas deveriam vender mais no segundo semestre do ano.
H8. Stores should sell more in the second half of the year.

Falsa Lojas vendem menos , no segundo semestre do ano.
False Stores sell less, in the second half of the year.

df4 %>% 
  group_by(month) %>% 
  summarise(sales=sum(sales)) %>% 
  ggplot(aes(month, sales))+
  geom_line(col="darkgreen", lwd=1)+
  scale_x_continuous(breaks = c(1,2,3,4,5,6,7,8,9,10,11,12))+
  scale_y_continuous(labels = scales::label_number_si())+
  labs(title = "month vs sales")+
  theme(plot.title = element_text(hjust = 0.5, size = 18))

# Pearson correlation between month and sales
# (typo fix: the bootstrap option is "condensed", not "condesend")
df4 %>%
  select(month, sales) %>%
  cor(method = "pearson") %>%
  kable(caption = "Correlation Table month vs Sales") %>%
  kable_styling(full_width = FALSE,
                bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                html_font = "Cambria")
Correlation Table month vs Sales
month sales
month 1.0000000 0.0735887
sales 0.0735887 1.0000000

H9. Lojas deveriam vender mais depois do dia 10 de cada mês.
H9. Stores should sell more after the 10th of each month.

Verdadeira: lojas vendem mais depois do dia 10 de cada mês.
True: stores sell more after the 10th of each month.

# H9, panel 1: total sales per day of the month.
fig10 <- df4 %>%
  group_by(day) %>%
  summarise(sales = sum(sales)) %>%
  ggplot(aes(x = day, y = sales)) +
  geom_line(col = "darkgreen", lwd = 1) +
  scale_y_continuous(labels = scales::label_number_si()) +
  scale_x_continuous(breaks = 0:31) +
  labs(title = "day vs sales") +
  theme(plot.title = element_text(hjust = 0.5, size = 18))

# H9, panel 2: sales density split at day 10 of the month.
fig11 <- df4 %>%
  mutate(day = ifelse(day <= 10, "before_10_days", "after_10_days")) %>%
  ggplot(aes(x = sales, fill = day)) +
  geom_density(alpha = 0.4) +
  labs(title = "Distribution sales / before_10_days / after_10_days ") +
  theme(plot.title = element_text(hjust = 0.5, size = 18))

# Show both panels stacked vertically.
grid.arrange(fig10, fig11, nrow = 2, ncol = 1)

# Pearson correlation between day-of-month and sales, backing the H9 conclusion.
df4 %>% 
  select(day, sales) %>% 
  cor(method = "pearson") %>% 
  kable(caption = "Correlation Table day vs Sales") %>% 
  # Fix: "condensed" was misspelled "condesend", so kable_styling ignored it.
  kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condensed", "responsive"),html_font = "Cambria")
Correlation Table day vs Sales
day sales
day 1.0000000 -0.0518648
sales -0.0518648 1.0000000

H10. Lojas deveriam vender menos aos finais de semana.
H10. Stores should sell less on weekends.

Verdadeira Lojas vendem menos , nos finais de semana.
True Stores sell less, on weekends.

# H10: total sales per day of week.
# NOTE(review): assumes day_of_week is coded 1..7 -- confirm which day is 1.
df4 %>%
  group_by(day_of_week) %>%
  summarise(sales = sum(sales)) %>%
  ggplot(aes(x = day_of_week, y = sales)) +
  geom_line(col = "darkred", lwd = 1) +
  scale_y_continuous(labels = scales::label_number_si()) +
  scale_x_continuous(breaks = 1:7) +
  labs(title = "day_of_week vs sales ") +
  theme(plot.title = element_text(hjust = 0.5, size = 18))

# Pearson correlation between day_of_week and sales, backing the H10 conclusion.
df4 %>% 
  select(day_of_week, sales) %>% 
  cor(method = "pearson") %>% 
  kable(caption = "Correlation Table day_of_week vs Sales") %>% 
  # Fix: "condensed" was misspelled "condesend", so kable_styling ignored it.
  kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condensed", "responsive"),html_font = "Cambria")
Correlation Table day_of_week vs Sales
day_of_week sales
day_of_week 1.0000000 -0.1787531
sales -0.1787531 1.0000000

H11. Lojas deveriam vender menos durante os feriados escolares.
H11. Stores should sell less during school holidays.

Verdadeira: lojas vendem menos durante os feriados escolares, exceto nos meses de julho e agosto.
True: stores sell less during school holidays, except in July and August.

# H11, panel 1: sales density on school holidays vs. regular days.
fig12 <- df4 %>%
  mutate(school_holiday = as.factor(school_holiday)) %>%
  ggplot(aes(sales, fill = school_holiday)) +
  geom_density(alpha = 0.4) +
  labs(title = "Distribution sales / school_holiday ") +
  theme(plot.title = element_text(hjust = 0.5, size = 18))

# H11, panel 2: monthly totals split by the school-holiday flag.
# .groups = "drop" returns an ungrouped tibble and silences the grouped-output
# message, consistent with the other summarise() calls in this file.
fig13 <- df4 %>%
  group_by(month, school_holiday) %>%
  summarise(sales = sum(sales), .groups = "drop") %>%
  mutate(month = as.factor(month),
         school_holiday = as.factor(school_holiday)) %>%
  ggplot(aes(month, sales, fill = school_holiday)) +
  geom_bar(stat = 'identity', position = position_dodge(), col = "black") +
  scale_y_continuous(labels = scales::label_number_si()) +
  labs(title = "month vs sales / school_holiday ") +
  theme(plot.title = element_text(hjust = 0.5, size = 18))

grid.arrange(fig12, fig13, nrow = 2, ncol = 1)

Summary of Hypotheses

# Consolidated verdict for hypotheses H1-H11 with their business relevance.
kable(data.frame(Hypotheses  = c("H1","H2","H3","H4","H5","H6","H7","H8","H9","H10","H11"),
                 Conclusions = c("False","False","False","False","False","False","False","True","True","True","True"),
                 Relevance   = c("Medium","Low","Low","High","Medium","Low","Low","Low","Low","Low","Low")),
      caption = "Hypothesis Summary Table") %>% 
  # Fix: "condensed" was misspelled "condesend", so kable_styling ignored it.
  kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condensed"),html_font = "Cambria")
Hypothesis Summary Table
Hypotheses Conclusions Relevance
H1 False Medium
H2 False Low
H3 False Low
H4 False High
H5 False Medium
H6 False Low
H7 False Low
H8 True Low
H9 True Low
H10 True Low
H11 True Low

Multivariate Analysis

Numerical Attributes

# Pearson correlation matrix over all numeric attributes, rendered as a
# lower-triangle heat map; hc.order clusters correlated features together.
df4 %>% 
  keep(is.numeric) %>% 
  cor() %>% 
  ggcorrplot::ggcorrplot(hc.order = T,
             type = "lower",
             lab=T,
             lab_size = 3,
             method = "square",
             colors = c("chocolate1","white","darkcyan"),
             ggtheme = theme_minimal())

Categorical Attributes

# Association matrix for categorical attributes using Cramer's V, computed by
# the catcor() helper (vcd::assocstats) defined at the top of the file.
# Date-like character columns are excluded because they are not categories.
df4 %>%
  mutate(store_type = as.character(store_type)) %>% 
  keep(is.character) %>% 
  select(-first_day_of_month, -year_week) %>%
  as.data.frame() %>% 
  catcor(type="cramer") %>% 
  ggcorrplot::ggcorrplot(hc.order = T,
             type = "lower",
             lab=T,
             lab_size = 3,
             method = "square",
             colors = c("chocolate1","white","steelblue"),
             ggtheme = theme_minimal())

# NOTE(review): df4 is re-created from df3 here, discarding the analysis-only
# mutations made above -- presumably to start Data Preparation from a clean
# copy; verify df3 is still unmodified at this point.
df4 <- df3

Data Preparation

# Work on a copy so rescaling/encoding below does not mutate df4.
df5 <- df4

Rescaling

# Rescale features with the helpers defined at the top of the file:
# robust (IQR-based) scaling for the competition features, min-max
# scaling for promo_time_week and year.
df5 <- df5 %>%
  mutate(across(c(competition_distance, competition_time_month), robust_scaler),
         across(c(promo_time_week, year), minmax_scaler))

Transformation

Encoding

# state_holiday - One Hot Encoding.
# NOTE(review): fastDummies::dummy_cols() keeps the original state_holiday
# column alongside the generated state_holiday_* indicator columns.
one_hot_state_holiday <- df5 %>% 
  select(state_holiday) %>% 
  fastDummies::dummy_cols()

# store_type - Label Encoding.
# Fix: the original piped through select() %>% data.matrix() %>% as.data.frame(),
# which stored a one-column *data frame* inside df5$store_type (a nested column),
# and data.matrix() coerces character columns to NA. Encode to a plain integer
# vector via factor codes instead; levels are assigned alphabetically.
df5$store_type <- as.integer(as.factor(df5$store_type))

# assortment - Ordinal Encoding: basic < extra < extended as an ordered factor.
df5 <- df5 %>%
  mutate(assortment= recode_factor(assortment,"basic"="1",
                                              "extra"="2",
                                              "extended"="3",.ordered = TRUE))

# Append the one-hot columns built above.
# NOTE(review): one_hot_state_holiday still contains the raw state_holiday
# column, so after cbind() df5 holds two columns with that name -- confirm the
# select(-state_holiday) below drops the intended one(s).
df5 <- cbind(df5, one_hot_state_holiday)

# Drop the raw categorical column now that the dummies exist.
df5 <- df5 %>% 
  select( - state_holiday)

Variable Response Transformation

# Response transformation: log1p() compresses the right-skewed sales
# distribution; predictions are inverted later with expm1().
df5 <- df5 %>% 
  mutate(sales = log1p(sales))
# Visual check: histogram of the transformed response with a fitted normal curve.
# NOTE(review): ..density.. is deprecated in newer ggplot2 (use after_stat(density)).
df5 %>% 
  ggplot(aes(sales))+
  geom_histogram(aes(y =..density..),col="black", fill="steelblue")+
  stat_function(fun = dnorm, args = list(mean = mean(df5$sales), sd = sd(df5$sales)), col="red", lwd=1)

Nature Transformation

# Cyclical (sine/cosine) encoding of calendar features so that boundary values
# are numerically close (e.g. month 12 next to month 1). Values are rounded to
# 2 decimals.
df5 <- df5 %>% 
  mutate(day_of_week_sin = round(sin(day_of_week*(2. * pi/7)),2),
         day_of_week_cos = round(cos(day_of_week*(2. * pi/7)),2),
         month_sin = round(sin(month*(2. * pi/12)),2),
         month_cos = round(cos(month*(2. * pi/12)),2),
         # Period 30 approximates a month; days 1 and 31 will not map exactly together.
         day_sin = round(sin(day*(2. * pi/30)),2),
         day_cos = round(cos(day*(2. * pi/30)),2),
         week_of_year_sin = round(sin(week_of_year*(2. * pi/52)),2),
         week_of_year_cos = round(cos(week_of_year*(2. * pi/52)),2),
        )

# Drop the raw calendar columns now replaced by their cyclical versions.
df5 <- df5 %>% 
  select(- day_of_week, -month, -day, - week_of_year, - promo_since, -competition_since)

Feature Selection

# Snapshot of the prepared data before feature selection.
df6 <- df5

Selection of features with Boruta

#boruta <- Boruta(sales ~.,data = X_train ,doTrace =2 )

Manual Feature Selection

# Feature set chosen from the (commented-out) Boruta run above: predictors only.
cols_selected_boruta <- c("store","store_type","promo", "assortment", "competition_distance", "competition_open_since_month",
    "competition_open_since_year","promo2","promo2since_week","promo2since_year", "competition_time_month",
    "promo_time_week", "day_of_week_sin", "day_of_week_cos", "month_sin","month_cos", "day_sin", "day_cos",
    "week_of_year_sin", "week_of_year_cos")

# Same feature set plus the response (sales) and the date, needed for the
# time-based train/test split below.
cols_selected_boruta_full <- c("store","store_type","promo", "assortment", "competition_distance", "competition_open_since_month",
    "competition_open_since_year","promo2","promo2since_week","promo2since_year", "competition_time_month",
    "promo_time_week", "day_of_week_sin", "day_of_week_cos", "month_sin","month_cos", "day_sin", "day_cos",
    "week_of_year_sin", "week_of_year_cos", "sales", "date")

Reading the data

# Dataset of Sales: reload the prepared modelling data from disk.
# NOTE(review): hard-coded absolute path -- consider a relative path or
# here::here() so the notebook runs on other machines.
df5 <- read_csv("/home/renato/repos/Rossmann/inst/Data/data_modelling.csv")

Manual Feature Selection

# Drop X1, the row-index column carried along in the exported CSV.
df6 <-  as_tibble(df5) %>% 
  select(-X1)
# Boruta selected features.
# NOTE(review): re-declares the same cols_selected_boruta_full defined earlier;
# kept so this section is self-contained after the reload above.
cols_selected_boruta_full <- c("store","store_type","promo", "assortment", "competition_distance", "competition_open_since_month",
    "competition_open_since_year","promo2","promo2since_week","promo2since_year", "competition_time_month",
    "promo_time_week", "day_of_week_sin", "day_of_week_cos", "month_sin","month_cos", "day_sin", "day_cos",
    "week_of_year_sin", "week_of_year_cos", "sales", "date")

Machine Learning Modelling

# Keep only the Boruta-selected columns and undo the log1p() transform so the
# models are evaluated on sales in the original currency scale.
# all_of() makes selecting by an external character vector explicit; passing
# the bare vector is deprecated in tidyselect.
df6 <- df6 %>% 
  select(all_of(cols_selected_boruta_full)) %>% 
  mutate(sales = expm1(sales))


# Split the set into training and testing: hold out the last 6 weeks,
# accumulating all earlier observations into the training window.
data_split <- df6 %>%  time_series_split(assess = "6 weeks", cumulative = T)

# Dataset Train
data_train <- training(data_split)

# Dataset Test
data_test <- testing(data_split)

# Minimal recipe: model sales on all remaining predictors.
rec <- recipe(sales~.,data_train) 

# Metrics used for tuning/evaluation.
mt <- metric_set(yardstick::rmse, yardstick::mae, yardstick::mape)

Average Model

# Baseline model: predict each store's mean sales over the test window.
# Actual sales per test row.
actuals_tbl <- data_test %>%
  select(store, sales)

# Per-store mean sales used as the "prediction".
store_means <- data_test %>%
  group_by(store) %>%
  summarise(predictions = mean(sales))

# Join predictions back onto the actuals in the shape ml_error() expects
# (columns: actual, predictions).
avg_model <- actuals_tbl %>%
  left_join(store_means, by = "store") %>%
  select(sales, predictions) %>%
  rename(actual = "sales")

baseline_result <- ml_error("Average Model", avg_model)

baseline_result
     Model_Name      MAE   MAPE     RMSE
1 Average Model 1386.121 21.937 1836.357

Linear Regression Model

# Create Linear Regresion Model
#lm <-
#  linear_reg() %>% 
#  set_engine("lm") %>% 
#  set_mode("regression")
 
# Training Model
#lm_fit <- lm %>%
#  fit(sales ~ . , data = data_train)

# Preditions
#lm_pred <- lm_fit %>% 
#  predict(data_test) %>% 
#  bind_cols(data_test$sales) %>% 
#  rename(predictions = ".pred", actual = "...2")
  
# Evaluate
#lm_result <- ml_error("Linear Regression Model",lm_pred)

# Save pickle of results
#saveRDS(lm_result,"Resultados_Modelos/lm_result.rds")

# Load the persisted linear-regression test-set metrics (the training code
# above is commented out to avoid refitting on every render).
lm_result <- readRDS("/home/renato/repos/Rossmann/inst/Resultados_Modelos/lm_result.rds")

lm_result %>% 
  kable() %>% 
  # Fix: "condensed" was misspelled "condesend", so kable_styling ignored it.
  kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condensed", "responsive"), html_font = "Cambria")
Model_Name MAE MAPE RMSE
Linear Regression Model 1929.022 32.561 2661.223

Linear Regression Model - Cross Validation

#lm_result_cv <- cross_validation(data_training , 5, "Linear Regression Model Cross Validation", lm)

# Save pickle of results
#saveRDS(lm_result_cv,"Resultados_Modelos/lm_result_cv.rds")

# Cross-validated linear-regression metrics (mean +/- spread across folds),
# persisted by the commented-out cross_validation() call above.
lm_result_cv <- readRDS("/home/renato/repos/Rossmann/inst/Resultados_Modelos/lm_result_cv.rds")

lm_result_cv
# A tibble: 1 x 4
  Model_name                       MAE              MAPE         RMSE           
  <chr>                            <chr>            <chr>        <chr>          
1 Linear Regression Model Cross V… 2120.97  +/-  2… 33.16  +/- … 2914.3  +/-  4…

Random Forest Model

# Create Non-linear Model RandomForest
#rf <-
#  rand_forest(trees = 100) %>% 
#  set_engine("ranger") %>% 
#  set_mode("regression")

# Training Model
#rf_fit <- rf %>%
#  fit(sales ~ ., data = data_train)


# Preditions
#rf_pred <- rf_fit %>% 
#   predict(data_test) %>% 
#  bind_cols(data_test$sales) %>% 
#  rename(predictions = ".pred", actual = "...2")


#rf_result <- ml_error("Random Forest Model",rf_pred)  

# Save pickle of results
#saveRDS(rf_result,"Resultados_Modelos/rf_result.rds")

# Load the persisted random-forest test-set metrics.
rf_result <- readRDS("/home/renato/repos/Rossmann/inst/Resultados_Modelos/rf_result.rds")

rf_result %>% 
  kable() %>% 
  # Fix: "condensed" was misspelled "condesend", so kable_styling ignored it.
  kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condensed", "responsive"), html_font = "Cambria")
Model_Name MAE MAPE RMSE
Random Forest Model 934.618 15.369 1294.868

Random Forest Model - Cross Validation

#rf_result_cv <- cross_validation(data_training , 5, "Random Forest Model Cross Validation", rf)

# Save pickle of results
#saveRDS(rf_result_cv,"Resultados_Modelos/rf_result_cv.rds")

# Cross-validated random-forest metrics, persisted by the commented-out
# cross_validation() call above.
rf_result_cv <- readRDS("/home/renato/repos/Rossmann/inst/Resultados_Modelos/rf_result_cv.rds")

rf_result_cv
# A tibble: 1 x 4
  Model_name                      MAE             MAPE          RMSE            
  <chr>                           <chr>           <chr>         <chr>           
1 Random Forest Model Cross Vali… 1159.87  +/-  … 17.61  +/-  … 1611.5  +/-  50…

Xgboosting Model

# Create Model
#xg <-
#  boost_tree(trees = 100) %>% 
#  set_engine("xgboost") %>% 
#  set_mode("regression")

# Training Model
#xg_fit <- xg %>%
#  fit(sales ~ ., data = data_train)


# Preditions
#xg_pred <- xg_fit %>% 
#  predict(data_test) %>% 
#  bind_cols(data_test$sales) %>% 
#  rename(predictions = ".pred", actual = "...2")

# Load the persisted XGBoost test-set metrics (produced by the commented-out
# evaluate/save steps below on an earlier run).
xg_result <- readRDS("/home/renato/repos/Rossmann/inst/Resultados_Modelos/xg_result.rds")

# Evaluate
#xg_result <- ml_error("Xgboosting Model",xg_pred)

# Save pickle of results
#saveRDS(xg_result,"Resultados_Modelos/xg_result.rds")

xg_result %>% 
  kable() %>% 
  # Fix: "condensed" was misspelled "condesend", so kable_styling ignored it.
  kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condensed", "responsive"), html_font = "Cambria")
Model_Name MAE MAPE RMSE
Xgboosting Model 931.904 15.319 1276.638
# Cross-validated XGBoost metrics, persisted by the commented-out call below.
result_xg_cv <- readRDS("/home/renato/repos/Rossmann/inst/Resultados_Modelos/result_xg_cv.rds")

#result_xg_cv <- cross_validation(data_training , 5, "Xgboosting Model Cross Validation", xg)

# Save pickle of results
#saveRDS(result_xg_cv,"Resultados_Modelos/result_xg_cv.rds")

result_xg_cv
# A tibble: 1 x 4
  Model_name                   MAE               MAPE           RMSE            
  <chr>                        <chr>             <chr>          <chr>           
1 Xgboosting Model Cross Vali… 1127.08  +/-  19… 16.97  +/-  2… 1531.06  +/-  2…

Comparing Model Performances

Single Performances

# Single hold-out performance of all models, best (lowest RMSE) first.
bind_rows(baseline_result, lm_result, rf_result, xg_result) %>% 
  arrange(RMSE) %>% 
  kable() %>% 
  # Fix: "condensed" was misspelled "condesend", so kable_styling ignored it.
  kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condensed", "responsive"), html_font = "Cambria")
Model_Name MAE MAPE RMSE
Xgboosting Model 931.904 15.319 1276.638
Random Forest Model 934.618 15.369 1294.868
Average Model 1386.121 21.937 1836.357
Linear Regression Model 1929.022 32.561 2661.223

Real Performance - Cross Validation

# Cross-validated performance of all models.
# NOTE(review): the CV tibbles store RMSE as character ("x +/- y"), so
# arrange(RMSE) sorts lexicographically, not numerically -- confirm ordering.
bind_rows( lm_result_cv, rf_result_cv, result_xg_cv) %>% 
  arrange(RMSE) %>% 
  kable() %>% 
  # Fix: "condensed" was misspelled "condesend", so kable_styling ignored it.
  kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condensed", "responsive"), html_font = "Cambria")
Model_name MAE MAPE RMSE
Xgboosting Model Cross Validation 1127.08 +/- 196.56 16.97 +/- 2.53 1531.06 +/- 244.39
Random Forest Model Cross Validation 1159.87 +/- 380 17.61 +/- 5.83 1611.5 +/- 504.44
Linear Regression Model Cross Validation 2120.97 +/- 252.78 33.16 +/- 3.15 2914.3 +/- 429.76

HYPERPARAMETER FINE TUNING

Creating parameter grid

#xgb_spec <- boost_tree(
#  trees = 1000, 
#  tree_depth = tune(), min_n = tune(), 
#  loss_reduction = tune(),                     
#  sample_size = tune(), mtry = tune(),         
#  learn_rate = tune(),                         
#) %>% 
#  set_engine("xgboost") %>% 
#  set_mode("regression")


#xgb_grid <- grid_latin_hypercube(
  
  
#  tree_depth(),
#  min_n(),
#  loss_reduction(),
#  sample_size = sample_prop(),
#  finalize(mtry(), data_train),
#  learn_rate(),
#  size = 10
#)

Tuning Xgboost Model

#xgb_wf <- workflow() %>%
#  add_formula(sales ~ .) %>%
#  add_model(xgb_spec)#

#set.seed(234)

#all_cores <- parallel::detectCores(logical = FALSE)


#cl <- makePSOCKcluster(all_cores - 1)
#registerDoParallel(cl)


#xgb_res <- tune_grid(
#  xgb_wf,
#  resamples = cv,
#  grid = xgb_grid,metrics = mt,
#  control = control_grid(save_pred = TRUE)
#)

Results and Metrics

#xgb_res<- readRDS("Resultados_Modelos/xgb_res.rds")

#xgb_res %>%
#  collect_metrics() %>%
#  filter(.metric == "rmse") %>%
#  select(mean, mtry:sample_size) %>%
#  pivot_longer(mtry:sample_size,
#               values_to = "value",
#               names_to = "parameter"
#  ) %>%
#  ggplot(aes(value, mean, color = parameter)) +
#  geom_point(alpha = 0.8, show.legend = FALSE) +
#  facet_wrap(~parameter, scales = "free_x") +
#  labs(x = NULL, y = "rmse")

Viewing parameters for RMSE metric

#best_rmse <- select_best(xgb_res, "rmse")
#best_rmse

Encapsulating the final parameters

#final_xgb <- finalize_workflow(
#  xgb_wf,
#  best_rmse
#)

#final_xgb
#all_cores <- parallel::detectCores(logical = FALSE)


#cl <- makePSOCKcluster(all_cores - 1)
#registerDoParallel(cl)

#final_res <- last_fit(final_xgb, data_split, metrics = mt)


#saveRDS(final_res,"Resultados_Modelos/final_res.rds")

#final_res <- readRDS("Resultados_Modelos/final_res.rds")

#collect_metrics(final_res)

Final Model

# Create Final Model
#xg <-
#  boost_tree(trees = 3000, 
#             tree_depth = 5, 
#             min_n = 3, 
#             sample_size = 0.7, 
#             learn_rate = 0.03, 
#             mtry = 7, 
#             loss_reduction = 0.03) %>% 
#  set_engine("xgboost") %>% 
#  set_mode("regression")

Viewing the results of the final model

#all_cores <- parallel::detectCores(logical = FALSE)


#cl <- makePSOCKcluster(all_cores - 1)
#registerDoParallel(cl)

#xg_fit <- xg %>%
  #fit(sales ~ ., data = data_train)

#saveRDS(xg_fit,"Resultados_Modelos/xg_fit_final.rds")

# Load the tuned XGBoost fit persisted by the commented-out training above.
xg_fit_final <- readRDS("/home/renato/repos/Rossmann/inst/Resultados_Modelos/xg_fit_final.rds")

# Preditions
#xg_pred <- xg_fit %>% 
#  predict(data_test) %>% 
#  bind_cols(data_test$sales) %>% 
#  rename(predictions = ".pred", actual = "...2")

#saveRDS(xg_pred,"Resultados_Modelos/xg_pred_final.rds")

# Load the persisted final-model predictions on the test set.
xg_pred_final <- readRDS("/home/renato/repos/Rossmann/inst/Resultados_Modelos/xg_pred_final.rds")

# Evaluate MAE / MAPE / RMSE of the tuned model via the ml_error() helper.
xg_result_final <- ml_error("Xgboosting Final Model",xg_pred_final)

xg_result_final %>% 
   kable() %>% 
   # Fix: "condensed" was misspelled "condesend", so kable_styling ignored it.
   kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condensed", "responsive"), html_font = "Cambria")
Model_Name MAE MAPE RMSE
Xgboosting Final Model 848.912 13.49 1176.609

Business Error Interpretation and Translation

# Copy the test set and attach the final model's predictions for the
# business-error analysis below.
df7 <- data_test

# NOTE(review): assumes xg_pred_final rows align 1:1 with data_test -- confirm.
df7$predictions <- xg_pred_final$predictions

Business Performance

# Total estimated revenue per store over the test window.
df71 <- df7 %>% 
  group_by(store) %>% 
  summarise(predictions = sum(predictions), .groups="drop")

# Mean absolute error and mean absolute percentage error per store.
df72 <- df7 %>%
  group_by(store) %>%
  summarise(mae= Metrics::mae(sales, predictions),
            mape = Metrics::mape(sales, predictions)*100, .groups="drop")

# Merge total forecasted revenue with the per-store errors.
df73 <- df71 %>% 
  inner_join(df72, by="store")

# Bracket each store's forecast by one MAE (worst/best scenario) and list the
# hardest-to-predict stores first. desc(mape) is the idiomatic form of -mape.
df73 <- df73 %>% 
  mutate(worst_scenario = predictions - mae,
         best_scenario = predictions + mae) %>% 
  select(store, predictions, worst_scenario, best_scenario, mae, mape) %>% 
  arrange(desc(mape))

rmarkdown::paged_table(head(df73))
# Flag stores whose MAPE exceeds 30% as hard to predict, for the plot below.
df73 <- df73 %>% 
  mutate(difficulty = ifelse(mape <= 30,"normal_pred", "hard_pred"))

# Scatter plot of store vs MAPE, shape/colour coded by prediction difficulty.
difficult_pred <- df73 %>% 
  ggplot(aes(store, mape)) + geom_point(aes(shape= difficulty ,col= difficulty), size=2.5, alpha=0.4)+ scale_y_continuous(breaks = seq(0,70,10))+
  scale_shape_manual(values=c(10, 16))+
  scale_x_continuous(breaks = seq(0,1500, 100))+
  labs(title= "store vs mape")+
  theme(plot.title = element_text(hjust = 0.5, size = 18))

# Render interactively with plotly so individual stores can be hovered.
ggplotly(difficult_pred)

No gráfico acima é possível ver em vermelho as lojas que o algoritmo teve mais dificuldade em prever.
In the graph above it is possible to see in red the stores that the algorithm had more difficulty in predicting.

Total Performance

# Total predicted revenue across all stores for the next 6 weeks, bracketed by
# the summed worst/best scenarios, formatted as Brazilian currency.
df73 %>% 
  summarise(predictions= formattable::currency(sum(predictions), big.mark = ",", symbol = "R$"),
            worst_scenario= formattable::currency(sum(worst_scenario), big.mark = ",", symbol = "R$"),
            best_scenario = formattable::currency(sum(best_scenario), big.mark = ",", symbol = "R$")) %>% 
  kable(caption = "Predictions in all stores in the next 6 weeks") %>% 
  # Fix: "condensed" was misspelled "condesend", so kable_styling ignored it.
  kable_styling(full_width = F, bootstrap_options = c("striped", "hover", "condensed", "responsive"), html_font = "Cambria")
Predictions in all stores in the next 6 weeks
predictions worst_scenario best_scenario
R$283,870,334.50 R$282,919,399.55 R$284,821,269.45

Machine Learning Performance

# Create the error and error_rate features:
# error = actual - predicted; error_rate = predicted / actual (1 = perfect).
df7 <- df7 %>% 
  mutate(error = sales - predictions,
         error_rate = predictions / sales)
# Four diagnostic panels stacked vertically.
ggarrange(
  
  # Panel 1: mean daily error_rate over time; dashed red line marks a perfect ratio of 1.
  df7 %>%
  summarise_by_time(date, .by = "day",error_rate = mean(error_rate)) %>% 
  ggplot(aes(date, error_rate))+
  geom_line(lwd=1, col="steelblue")+
  geom_hline(yintercept=1, linetype="dashed", color = "red", lwd=0.8)+
  scale_y_continuous(breaks = seq(0, 2, 0.05))+
  labs(title = "Error_rate vs Date")+  
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),plot.title = element_text(hjust = 0.5, size = 18)),

  # Panel 2: daily totals of actual vs predicted sales.
  # NOTE(review): group_by(date) before summarise_by_time(.by = "day") looks
  # redundant -- confirm it does not change the aggregation.
  df7 %>% 
  group_by(date) %>% 
  summarise_by_time(date,.by = "day", sales = sum(sales), predictions = sum(predictions), .groups= "drop") %>% 
  ggplot( ) + aes(x= date) + geom_line(lwd=1,aes(y= sales, col= "sales", group = "sales"))+
  geom_line(lwd=1,aes(y= predictions, col= "predictions", group = "predictions"))+
  scale_y_continuous(labels = scales::label_number_si())+
  scale_x_date(breaks = "1 weeks", minor_breaks = "1 day", date_labels = "%Y-%m-%d")+
  labs(title= "Date vs Sales/Predictions")+
  theme(plot.title = element_text(hjust = 0.5, size = 18)),

  # Panel 3: residuals against predicted values.
  df7 %>% 
  ggplot(aes(predictions, error))+
  geom_point(alpha=0.4, shape=21, fill="darkgreen", size=2, color="white")+
  scale_x_continuous(breaks = seq(0,30000,2000))+  
  labs(title= "Residual Plot")+
  theme(plot.title = element_text(hjust = 0.5, size = 18)),
  
  # Panel 4: error distribution against a fitted normal curve.
  df7 %>% 
  ggplot(aes(error))+
  geom_histogram(aes(y =..density..),col="black", fill="chocolate")+
  stat_function(fun = dnorm, args = list(mean = mean(df7$error), sd = sd(df7$error)), col="red", lwd=1)+
  labs(title = "Distribution Error")+
  theme(plot.title = element_text(hjust = 0.5, size = 18))+
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) ,nrow  =4)

- Erro vs Data: valores acima da linha vermelha são dias em que o modelo fez uma superestimação, e valores abaixo da linha vermelha são dias em que o modelo fez uma subestimação.

  • Error_rate vs Date: Values above the red line are days when the model overestimated, and values below the red line are days when the model underestimated.
# Mean Percentage Error: a negative value means predictions exceed actual
# sales on average, i.e. the model overestimates.
MPE <- mean((df7$sales - df7$predictions)/df7$sales )

MPE
[1] -0.04422607

Olhando a métrica MPE podemos ver que o modelo está superestimando, pois o valor é negativo; com isso, podemos dizer que o modelo tende a prever valores acima do real.

Looking at the MPE metric we can see that the model is overestimating, because the value is negative; thus the model tends to predict values above the actual ones.

  • Data vs Vendas / Predições: Podemos ver que as predições estão bem proximas dos valores reais , e tambem podemos ver os locais onde as erro nas predições são maiores.

  • Date vs Sales/Predictions: We can see that the predictions are very close to the actual values, and we can also see the places where the errors in the predictions are greatest.

  • Distribuição do Erro: Através desse gráfico é possivel ver que erro está bem próximo de uma distribuição normal.

  • Distribution Error: Through this graph it is possible to see that the error is very close to a normal distribution.

  • Predições vs Erro: Neste gráfico de residuos podemos ver que sempre que fizermos predições entre 7000 e 10000, o modelo irá errar mais.

  • Predictions vs Error: In this residual plot we can see that whenever we make predictions between 7000 and 10000, the model will make more mistakes.